COVID 19 Analysis in India¶

This dataset depicts the daily confirmed and daily deaths recorded due to COVID-19 in India.

In [ ]:
import pandas as pd
import numpy as np
data = pd.read_csv("covid19_india.csv")
print(data.head())
data.describe()
              Date    Date_YMD  Daily Confirmed  Daily Deceased
0  30 January 2020  2020-01-30                1               0
1  31 January 2020  2020-01-31                0               0
2  1 February 2020  2020-02-01                0               0
3  2 February 2020  2020-02-02                1               0
4  3 February 2020  2020-02-03                1               0
Out[ ]:
Daily Confirmed Daily Deceased
count 720.000000 720.000000
mean 52637.915278 675.901389
std 78522.746943 932.895333
min 0.000000 0.000000
25% 10419.250000 159.000000
50% 27383.000000 387.000000
75% 54300.750000 734.750000
max 414280.000000 6139.000000
In [ ]:
import plotly.express as px
fig = px.bar(data, x='Date_YMD', y='Daily Confirmed')
fig.show(renderer='notebook') #fig.show(renderer='notebook') Use this instead of fig.show() to export plots
In [ ]:
fig = px.bar(data, x='Date_YMD', y='Daily Deceased')
fig.show(renderer='notebook')

Overlaying the Confirmed and Death graphs over each other to see trends¶

In [ ]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Create figure with subplots
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces for Daily Confirmed and Daily Deceased
fig.add_trace(
    go.Bar(x=data['Date_YMD'], y=data['Daily Confirmed'], name='Daily Confirmed'),
    secondary_y=False,
)

fig.add_trace(
    go.Bar(x=data['Date_YMD'], y=data['Daily Deceased'], name='Daily Deceased'),
    secondary_y=True,
)

# Update layout
fig.update_layout(
    title_text="Daily Confirmed and Daily Deceased Cases",
    xaxis_title="Date",
    yaxis_title="Daily Confirmed Cases",
    yaxis2_title="Daily Deceased Cases",
)

# Show figure
fig.show(renderer='notebook')

Checking Linear regression between Confirmed Cases and Deaths¶

In [ ]:
from sklearn.linear_model import LinearRegression
import numpy as np

# Assume 'data' contains your dataset

# Extracting features and target variable
X = data[['Daily Confirmed']]
y = data['Daily Deceased']

# Initialize and fit the linear regression model
model = LinearRegression()
model.fit(X, y)

# Get the coefficients
coef = model.coef_[0]
intercept = model.intercept_

# Print the coefficients
print("Coefficients: ", coef)
print("Intercept: ", intercept)
Coefficients:  0.009364690596853669
Intercept:  182.96359864910318

Scatterplot to check the fit of Confirmed Cases and Deaths¶

In [ ]:
import matplotlib.pyplot as plt

# Scatter plot
plt.scatter(data['Daily Confirmed'], data['Daily Deceased'], color='blue', label='Actual Data')

# Plotting the regression line
plt.plot(X, model.predict(X), color='red', linewidth=2, label='Regression Line')

plt.xlabel('Daily Confirmed Cases')
plt.ylabel('Daily Deceased Cases')
plt.title('Linear Regression: Daily Confirmed vs Daily Deceased Cases')
plt.legend()
plt.show()
No description has been provided for this image

Doing a Logistic Regression to answer whether confirmed cases lead to Deaths¶

In [ ]:
# Calculate mean and median of 'Daily Deceased' cases
mean_deceased = data['Daily Deceased'].mean()
median_deceased = data['Daily Deceased'].median()

threshold = mean_deceased  # or median_deceased

print("Threshold:", threshold)
Threshold: 675.9013888888888
In [ ]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Defining features and target variable
X = data[['Daily Confirmed']]
y = (data['Daily Deceased'] > threshold).astype(int)  # Assuming 'threshold' is predefined

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and fit the logistic regression model
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train, y_train)

# Predicting on the testing set
y_pred = log_reg_model.predict(X_test)

# Calculating accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Accuracy: 0.2777777777777778
In [ ]:
import numpy as np
import matplotlib.pyplot as plt

# Scatter plot of actual data
plt.scatter(X_test.values.flatten(), y_test, color='blue', label='Actual Data')

# Plotting the decision boundary
x_values = np.linspace(X.min(), X.max(), 100)
y_values = log_reg_model.predict_proba(x_values.reshape(-1, 1))[:, 1]
plt.plot(x_values, y_values, color='red', label='Decision Boundary')

plt.xlabel('Daily Confirmed Cases')
plt.ylabel('Probability of Daily Deceased Cases > Threshold')
plt.title('Logistic Regression: Decision Boundary')
plt.legend()
plt.show()
c:\Users\Vikas\miniconda3\envs\myenv\lib\site-packages\sklearn\base.py:464: UserWarning:

X does not have valid feature names, but LogisticRegression was fitted with feature names

No description has been provided for this image